In [1]:
import pandas as pd
from pandas import DataFrame
import os
import re
import numpy as np
import cPickle as pickle
from collections import Counter
from sklearn.manifold import TSNE
import nltk
One-time preprocessing methods
In [18]:
def get_all_topics():
    genre = np.array(['tech', 'politics', 'music', 'sports'])
    tech = np.array(['@microsoft', 'nokia', 'amazon', 'amazon prime', 'amazon prime day', 'apple', 'apple watch', 'ipad', 'iphone', 'ipod', 'oracle', 'ibm', 'nintendo', 'moto g', 'google', 'google +', 'ps4', 'netflix'])
    politics = np.array(['angela merkel', 'bernie sanders', 'david cameron', 'donald trump', 'hillary', 'joe biden', 'michelle obama', 'obama', 'rahul gandhi', 'tony blair'])
    music = np.array(['bee gees', 'beyonce', 'bob marley', 'chris brown', 'david bowie', 'katy perry', 'ed sheeran', 'foo fighters', 'janet jackson', 'lady gaga', 'michael jackson', 'ac/dc', 'the vamps', 'iron maiden', 'rolling stone', 'jay-z', 'snoop dogg', 'nirvana'])
    sports = np.array(['arsenal', 'barca', 'federer', 'floyd mayweather', 'hulk hogan', 'john cena', 'kris bryant', 'randy orton', 'real madrid', 'serena', 'messi', 'david beckham', 'rousey', 'super eagles', 'kane', 'red sox', 'white sox'])
    all_topics = np.concatenate((tech, politics, music, sports))
    return [all_topics, genre]
In [14]:
def word2topic_preprocess():
    # Load the pretrained word -> topic-space embedding dictionary.
    word2topic = pickle.load(open("word2topic", "rb"))
    # A set gives O(1) membership tests in getEmbeddingWord2Topic;
    # the dict itself is also returned, since the lookup needs it.
    keys = set(word2topic.keys())
    return [word2topic, keys]
In [ ]:
def getTopicId(topic):
    # Numeric id = position of the topic in the concatenated all_topics array.
    return all_topics.tolist().index(topic)
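For example (once all_topics has been loaded via get_all_topics below), 'apple' sits at position 5 of the tech block:
In [ ]:
print getTopicId('apple')  # 5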
In [16]:
def word2vec_preprocess(df):
    from gensim.models import Word2Vec
    import logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Train a 100-dimensional Word2Vec model on the tokenized tweets.
    model = Word2Vec(df["tokenized_sents"], size=100, window=5, min_count=5, workers=4)
    model.save("word2vec")
    model = Word2Vec.load("word2vec")
    # Precompute L2-normalized vectors; replace=True frees the raw vectors.
    model.init_sims(replace=True)
    return model
In [ ]:
# Add a one-hot encoding of the topics
def one_hot_encoding():
    # One indicator column per topic in all_topics.
    one_hot = pd.get_dummies(all_topics)
    return one_hot
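Each topic then maps to a 0/1 indicator vector of length len(all_topics). A sketch of the lookup used later (assumes all_topics is already in scope):
In [ ]:
one_hot = one_hot_encoding()
# Column 'apple' is 1 at the row where all_topics == 'apple', 0 elsewhere.
print one_hot['apple'].values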
In [ ]:
Other helper methods
In [15]:
def getEmbeddingWord2Topic(sentence):
    # Concatenate per-word topic vectors, then truncate/zero-pad to a fixed length.
    num_of_words = 30
    size_of_vec = 100
    embedding_size = num_of_words * size_of_vec
    vec = np.array([])
    for word in sentence:
        if word in keys:
            vec = np.append(vec, word2topic[word])
    if vec.size > embedding_size:
        vec = vec[0:embedding_size]
    pad = np.zeros(embedding_size - vec.size)
    vec = np.append(vec, pad)
    return vec
In [17]:
def getEmbedding(sentence):
    # Same fixed-length scheme, using the Word2Vec vectors trained above.
    num_of_words = 30
    size_of_vec = 100
    embedding_size = num_of_words * size_of_vec
    vec = np.array([])
    for word in sentence:
        if word in model.wv.vocab:
            vec = np.append(vec, model.wv[word])
    if vec.size > embedding_size:
        vec = vec[0:embedding_size]
    pad = np.zeros(embedding_size - vec.size)
    vec = np.append(vec, pad)
    return vec
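Both embedding functions map every tweet to the same fixed 3000-dimensional vector (30 words x 100 dims), truncating longer tweets and zero-padding shorter ones. A quick shape check (a sketch with a made-up tweet; requires the Word2Vec model trained in the pipeline below):
In [ ]:
# Sketch: output length is independent of tweet length.
sample = nltk.word_tokenize("the new iphone looks great")
print getEmbedding(sample).shape  # expected: (3000,)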
In [ ]:
def logistic_regression(X, Y):
    print "Logistic Regression"
    from sklearn.linear_model import LogisticRegression
    logregr = LogisticRegression()
    logregr.fit(X, Y)
    # X_test is the held-out split defined at module level below.
    pred = logregr.predict(X_test)
    return pred
In [20]:
def add_features_to_df():
    # Keep only tweets whose topic is in the known list, then add a numeric topic id.
    df_filter = df[df["topic"].isin(all_topics)].copy()
    df_filter['topic_id'] = df_filter['topic'].apply(getTopicId)
    return df_filter
In [21]:
def add_one_hot_encoding():
    df_filter['vector'] = df_filter['embedding']  # + one_hot[df['topic']].T
    for index, row in df_filter.iterrows():
        one_hot_encoding = one_hot[row['topic']]
        # Write through .at: mutating the `row` copy from iterrows() would be lost.
        df_filter.at[index, 'vector'] = np.concatenate([row['vector'], one_hot_encoding])
In [ ]:
[X, y, df, d] = pickle.load(open("data_rnn", "rb"))
[all_topics, genre] = get_all_topics()
df['tokenized_sents'] = df.apply(lambda row: nltk.word_tokenize(row['tweet']), axis=1)
In [ ]:
## To get the Word2Vec embedding
## (preprocessing trains and caches the model)
model = word2vec_preprocess(df)
df['embedding'] = df['tokenized_sents'].apply(getEmbedding)
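A quick sanity check on the trained model (a sketch; assumes 'apple' cleared the min_count=5 cutoff):
In [ ]:
# Sketch: nearest neighbours of a topic word in the embedding space.
print model.wv.most_similar('apple', topn=5)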
In [22]:
## Alternative: word2topic embeddings instead of Word2Vec
[word2topic, keys] = word2topic_preprocess()
df['embedding'] = df['tokenized_sents'].apply(getEmbeddingWord2Topic)
In [ ]:
df_filter = add_features_to_df()
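Optionally, each topic's one-hot indicator can be appended to its tweet embeddings (both helpers are defined above; note that add_one_hot_encoding mutates df_filter in place):
In [ ]:
# Optional: augment each tweet's embedding with its topic's one-hot vector.
one_hot = one_hot_encoding()
add_one_hot_encoding()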
In [ ]:
import copy
# Deep-copy the slices so later edits to df_filter cannot alias the model inputs.
X = copy.deepcopy(np.vstack(df_filter['embedding'][0:5000]))
X_test = copy.deepcopy(np.vstack(df_filter['embedding'][5000:6357]))
Y = copy.deepcopy(df_filter['sentiment'][0:5000])
Y_test = copy.deepcopy(df_filter['sentiment'][5000:6357])
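Finally, fit and score the classifier (a sketch; logistic_regression reads X_test from the global scope as defined above, and accuracy_score is sklearn's standard metric):
In [ ]:
# Sketch: train on the first 5000 rows, evaluate on the held-out tail.
from sklearn.metrics import accuracy_score
pred = logistic_regression(X, Y)
print "test accuracy:", accuracy_score(Y_test, pred)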